%%js
/** vim: et:ts=4:sw=4:sts=4
 * @license RequireJS 2.3.6 Copyright jQuery Foundation and other contributors.
 * Released under MIT license, https://github.com/requirejs/requirejs/blob/master/LICENSE
 *
 * NOTE: vendored, minified third-party code follows — do not hand-edit;
 * replace it wholesale from the upstream RequireJS release if an update is needed.
 */
var requirejs,require,define;!function(global,setTimeout){var req,s,head,baseElement,dataMain,src,interactiveScript,currentlyAddingScript,mainScript,subPath,version="2.3.6",commentRegExp=/\/\*[\s\S]*?\*\/|([^:"'=]|^)\/\/.*$/gm,cjsRequireRegExp=/[^.]\s*require\s*\(\s*["']([^'"\s]+)["']\s*\)/g,jsSuffixRegExp=/\.js$/,currDirRegExp=/^\.\//,op=Object.prototype,ostring=op.toString,hasOwn=op.hasOwnProperty,isBrowser=!("undefined"==typeof window||"undefined"==typeof navigator||!window.document),isWebWorker=!isBrowser&&"undefined"!=typeof importScripts,readyRegExp=isBrowser&&"PLAYSTATION 3"===navigator.platform?/^complete$/:/^(complete|loaded)$/,defContextName="_",isOpera="undefined"!=typeof opera&&"[object Opera]"===opera.toString(),contexts={},cfg={},globalDefQueue=[],useInteractive=!1;function commentReplace(e,t){return t||""}function isFunction(e){return"[object Function]"===ostring.call(e)}function isArray(e){return"[object Array]"===ostring.call(e)}function each(e,t){var i;if(e)for(i=0;i<e.length&&(!e[i]||!t(e[i],i,e));i+=1);}function eachReverse(e,t){var i;if(e)for(i=e.length-1;-1<i&&(!e[i]||!t(e[i],i,e));i-=1);}function hasProp(e,t){return hasOwn.call(e,t)}function getOwn(e,t){return hasProp(e,t)&&e[t]}function eachProp(e,t){var i;for(i in e)if(hasProp(e,i)&&t(e[i],i))break}function mixin(i,e,r,n){return e&&eachProp(e,function(e,t){!r&&hasProp(i,t)||(!n||"object"!=typeof e||!e||isArray(e)||isFunction(e)||e instanceof RegExp?i[t]=e:(i[t]||(i[t]={}),mixin(i[t],e,r,n)))}),i}function bind(e,t){return function(){return t.apply(e,arguments)}}function scripts(){return document.getElementsByTagName("script")}function defaultOnError(e){throw e}function getGlobal(e){if(!e)return e;var t=global;return each(e.split("."),function(e){t=t[e]}),t}function makeError(e,t,i,r){var n=new Error(t+"\nhttps://requirejs.org/docs/errors.html#"+e);return n.requireType=e,n.requireModules=r,i&&(n.originalError=i),n}if(void 0===define){if(void 
0!==requirejs){if(isFunction(requirejs))return;cfg=requirejs,requirejs=void 0}void 0===require||isFunction(require)||(cfg=require,require=void 0),req=requirejs=function(e,t,i,r){var n,o,a=defContextName;return isArray(e)||"string"==typeof e||(o=e,isArray(t)?(e=t,t=i,i=r):e=[]),o&&o.context&&(a=o.context),(n=getOwn(contexts,a))||(n=contexts[a]=req.s.newContext(a)),o&&n.configure(o),n.require(e,t,i)},req.config=function(e){return req(e)},req.nextTick=void 0!==setTimeout?function(e){setTimeout(e,4)}:function(e){e()},require||(require=req),req.version=version,req.jsExtRegExp=/^\/|:|\?|\.js$/,req.isBrowser=isBrowser,s=req.s={contexts:contexts,newContext:newContext},req({}),each(["toUrl","undef","defined","specified"],function(t){req[t]=function(){var e=contexts[defContextName];return e.require[t].apply(e,arguments)}}),isBrowser&&(head=s.head=document.getElementsByTagName("head")[0],baseElement=document.getElementsByTagName("base")[0],baseElement&&(head=s.head=baseElement.parentNode)),req.onError=defaultOnError,req.createNode=function(e,t,i){var r=e.xhtml?document.createElementNS("http://www.w3.org/1999/xhtml","html:script"):document.createElement("script");return r.type=e.scriptType||"text/javascript",r.charset="utf-8",r.async=!0,r},req.load=function(t,i,r){var e,n=t&&t.config||{};if(isBrowser)return(e=req.createNode(n,i,r)).setAttribute("data-requirecontext",t.contextName),e.setAttribute("data-requiremodule",i),!e.attachEvent||e.attachEvent.toString&&e.attachEvent.toString().indexOf("[native 
code")<0||isOpera?(e.addEventListener("load",t.onScriptLoad,!1),e.addEventListener("error",t.onScriptError,!1)):(useInteractive=!0,e.attachEvent("onreadystatechange",t.onScriptLoad)),e.src=r,n.onNodeCreated&&n.onNodeCreated(e,n,i,r),currentlyAddingScript=e,baseElement?head.insertBefore(e,baseElement):head.appendChild(e),currentlyAddingScript=null,e;if(isWebWorker)try{setTimeout(function(){},0),importScripts(r),t.completeLoad(i)}catch(e){t.onError(makeError("importscripts","importScripts failed for "+i+" at "+r,e,[i]))}},isBrowser&&!cfg.skipDataMain&&eachReverse(scripts(),function(e){if(head||(head=e.parentNode),dataMain=e.getAttribute("data-main"))return mainScript=dataMain,cfg.baseUrl||-1!==mainScript.indexOf("!")||(mainScript=(src=mainScript.split("/")).pop(),subPath=src.length?src.join("/")+"/":"./",cfg.baseUrl=subPath),mainScript=mainScript.replace(jsSuffixRegExp,""),req.jsExtRegExp.test(mainScript)&&(mainScript=dataMain),cfg.deps=cfg.deps?cfg.deps.concat(mainScript):[mainScript],!0}),define=function(e,i,t){var r,n;"string"!=typeof e&&(t=i,i=e,e=null),isArray(i)||(t=i,i=null),!i&&isFunction(t)&&(i=[],t.length&&(t.toString().replace(commentRegExp,commentReplace).replace(cjsRequireRegExp,function(e,t){i.push(t)}),i=(1===t.length?["require"]:["require","exports","module"]).concat(i))),useInteractive&&(r=currentlyAddingScript||getInteractiveScript())&&(e||(e=r.getAttribute("data-requiremodule")),n=contexts[r.getAttribute("data-requirecontext")]),n?(n.defQueue.push([e,i,t]),n.defQueueMap[e]=!0):globalDefQueue.push([e,i,t])},define.amd={jQuery:!0},req.exec=function(text){return eval(text)},req(cfg)}function newContext(u){var i,e,l,c,d,g={waitSeconds:7,baseUrl:"./",paths:{},bundles:{},pkgs:{},shim:{},config:{}},p={},f={},r={},h=[],m={},n={},v={},x=1,b=1;function q(e,t,i){var 
r,n,o,a,s,u,c,d,p,f,l=t&&t.split("/"),h=g.map,m=h&&h["*"];if(e&&(u=(e=e.split("/")).length-1,g.nodeIdCompat&&jsSuffixRegExp.test(e[u])&&(e[u]=e[u].replace(jsSuffixRegExp,"")),"."===e[0].charAt(0)&&l&&(e=l.slice(0,l.length-1).concat(e)),function(e){var t,i;for(t=0;t<e.length;t++)if("."===(i=e[t]))e.splice(t,1),t-=1;else if(".."===i){if(0===t||1===t&&".."===e[2]||".."===e[t-1])continue;0<t&&(e.splice(t-1,2),t-=2)}}(e),e=e.join("/")),i&&h&&(l||m)){e:for(o=(n=e.split("/")).length;0<o;o-=1){if(s=n.slice(0,o).join("/"),l)for(a=l.length;0<a;a-=1)if((r=getOwn(h,l.slice(0,a).join("/")))&&(r=getOwn(r,s))){c=r,d=o;break e}!p&&m&&getOwn(m,s)&&(p=getOwn(m,s),f=o)}!c&&p&&(c=p,d=f),c&&(n.splice(0,d,c),e=n.join("/"))}return getOwn(g.pkgs,e)||e}function E(t){isBrowser&&each(scripts(),function(e){if(e.getAttribute("data-requiremodule")===t&&e.getAttribute("data-requirecontext")===l.contextName)return e.parentNode.removeChild(e),!0})}function w(e){var t=getOwn(g.paths,e);if(t&&isArray(t)&&1<t.length)return t.shift(),l.require.undef(e),l.makeRequire(null,{skipMap:!0})([e]),!0}function y(e){var t,i=e?e.indexOf("!"):-1;return-1<i&&(t=e.substring(0,i),e=e.substring(i+1,e.length)),[t,e]}function S(e,t,i,r){var n,o,a,s,u=null,c=t?t.name:null,d=e,p=!0,f="";return e||(p=!1,e="_@r"+(x+=1)),u=(s=y(e))[0],e=s[1],u&&(u=q(u,c,r),o=getOwn(m,u)),e&&(u?f=i?e:o&&o.normalize?o.normalize(e,function(e){return q(e,c,r)}):-1===e.indexOf("!")?q(e,c,r):e:(u=(s=y(f=q(e,c,r)))[0],f=s[1],i=!0,n=l.nameToUrl(f))),{prefix:u,name:f,parentMap:t,unnormalized:!!(a=!u||o||i?"":"_unnormalized"+(b+=1)),url:n,originalName:d,isDefine:p,id:(u?u+"!"+f:f)+a}}function k(e){var t=e.id,i=getOwn(p,t);return i||(i=p[t]=new l.Module(e)),i}function M(e,t,i){var r=e.id,n=getOwn(p,r);!hasProp(m,r)||n&&!n.defineEmitComplete?(n=k(e)).error&&"error"===t?i(n.error):n.on(t,i):"defined"===t&&i(m[r])}function O(i,e){var t=i.requireModules,r=!1;e?e(i):(each(t,function(e){var 
t=getOwn(p,e);t&&(t.error=i,t.events.error&&(r=!0,t.emit("error",i)))}),r||req.onError(i))}function j(){globalDefQueue.length&&(each(globalDefQueue,function(e){var t=e[0];"string"==typeof t&&(l.defQueueMap[t]=!0),h.push(e)}),globalDefQueue=[])}function P(e){delete p[e],delete f[e]}function R(){var e,r,t=1e3*g.waitSeconds,n=t&&l.startTime+t<(new Date).getTime(),o=[],a=[],s=!1,u=!0;if(!i){if(i=!0,eachProp(f,function(e){var t=e.map,i=t.id;if(e.enabled&&(t.isDefine||a.push(e),!e.error))if(!e.inited&&n)w(i)?s=r=!0:(o.push(i),E(i));else if(!e.inited&&e.fetched&&t.isDefine&&(s=!0,!t.prefix))return u=!1}),n&&o.length)return(e=makeError("timeout","Load timeout for modules: "+o,null,o)).contextName=l.contextName,O(e);u&&each(a,function(e){!function n(o,a,s){var e=o.map.id;o.error?o.emit("error",o.error):(a[e]=!0,each(o.depMaps,function(e,t){var i=e.id,r=getOwn(p,i);!r||o.depMatched[t]||s[i]||(getOwn(a,i)?(o.defineDep(t,m[i]),o.check()):n(r,a,s))}),s[e]=!0)}(e,{},{})}),n&&!r||!s||!isBrowser&&!isWebWorker||d||(d=setTimeout(function(){d=0,R()},50)),i=!1}}function a(e){hasProp(m,e[0])||k(S(e[0],null,!0)).init(e[1],e[2])}function o(e,t,i,r){e.detachEvent&&!isOpera?r&&e.detachEvent(r,t):e.removeEventListener(i,t,!1)}function s(e){var t=e.currentTarget||e.srcElement;return o(t,l.onScriptLoad,"load","onreadystatechange"),o(t,l.onScriptError,"error"),{node:t,id:t&&t.getAttribute("data-requiremodule")}}function T(){var e;for(j();h.length;){if(null===(e=h.shift())[0])return O(makeError("mismatch","Mismatched anonymous define() module: "+e[e.length-1]));a(e)}l.defQueueMap={}}return c={require:function(e){return e.require?e.require:e.require=l.makeRequire(e.map)},exports:function(e){if(e.usingExports=!0,e.map.isDefine)return e.exports?m[e.map.id]=e.exports:e.exports=m[e.map.id]={}},module:function(e){return e.module?e.module:e.module={id:e.map.id,uri:e.map.url,config:function(){return 
getOwn(g.config,e.map.id)||{}},exports:e.exports||(e.exports={})}}},(e=function(e){this.events=getOwn(r,e.id)||{},this.map=e,this.shim=getOwn(g.shim,e.id),this.depExports=[],this.depMaps=[],this.depMatched=[],this.pluginMaps={},this.depCount=0}).prototype={init:function(e,t,i,r){r=r||{},this.inited||(this.factory=t,i?this.on("error",i):this.events.error&&(i=bind(this,function(e){this.emit("error",e)})),this.depMaps=e&&e.slice(0),this.errback=i,this.inited=!0,this.ignore=r.ignore,r.enabled||this.enabled?this.enable():this.check())},defineDep:function(e,t){this.depMatched[e]||(this.depMatched[e]=!0,this.depCount-=1,this.depExports[e]=t)},fetch:function(){if(!this.fetched){this.fetched=!0,l.startTime=(new Date).getTime();var e=this.map;if(!this.shim)return e.prefix?this.callPlugin():this.load();l.makeRequire(this.map,{enableBuildCallback:!0})(this.shim.deps||[],bind(this,function(){return e.prefix?this.callPlugin():this.load()}))}},load:function(){var e=this.map.url;n[e]||(n[e]=!0,l.load(this.map.id,e))},check:function(){if(this.enabled&&!this.enabling){var t,e,i=this.map.id,r=this.depExports,n=this.exports,o=this.factory;if(this.inited){if(this.error)this.emit("error",this.error);else if(!this.defining){if(this.defining=!0,this.depCount<1&&!this.defined){if(isFunction(o)){if(this.events.error&&this.map.isDefine||req.onError!==defaultOnError)try{n=l.execCb(i,o,r,n)}catch(e){t=e}else n=l.execCb(i,o,r,n);if(this.map.isDefine&&void 0===n&&((e=this.module)?n=e.exports:this.usingExports&&(n=this.exports)),t)return t.requireMap=this.map,t.requireModules=this.map.isDefine?[this.map.id]:null,t.requireType=this.map.isDefine?"define":"require",O(this.error=t)}else n=o;if(this.exports=n,this.map.isDefine&&!this.ignore&&(m[i]=n,req.onResourceLoad)){var 
a=[];each(this.depMaps,function(e){a.push(e.normalizedMap||e)}),req.onResourceLoad(l,this.map,a)}P(i),this.defined=!0}this.defining=!1,this.defined&&!this.defineEmitted&&(this.defineEmitted=!0,this.emit("defined",this.exports),this.defineEmitComplete=!0)}}else hasProp(l.defQueueMap,i)||this.fetch()}},callPlugin:function(){var u=this.map,c=u.id,e=S(u.prefix);this.depMaps.push(e),M(e,"defined",bind(this,function(e){var o,t,i,r=getOwn(v,this.map.id),n=this.map.name,a=this.map.parentMap?this.map.parentMap.name:null,s=l.makeRequire(u.parentMap,{enableBuildCallback:!0});return this.map.unnormalized?(e.normalize&&(n=e.normalize(n,function(e){return q(e,a,!0)})||""),M(t=S(u.prefix+"!"+n,this.map.parentMap,!0),"defined",bind(this,function(e){this.map.normalizedMap=t,this.init([],function(){return e},null,{enabled:!0,ignore:!0})})),void((i=getOwn(p,t.id))&&(this.depMaps.push(t),this.events.error&&i.on("error",bind(this,function(e){this.emit("error",e)})),i.enable()))):r?(this.map.url=l.nameToUrl(r),void this.load()):((o=bind(this,function(e){this.init([],function(){return e},null,{enabled:!0})})).error=bind(this,function(e){this.inited=!0,(this.error=e).requireModules=[c],eachProp(p,function(e){0===e.map.id.indexOf(c+"_unnormalized")&&P(e.map.id)}),O(e)}),o.fromText=bind(this,function(e,t){var i=u.name,r=S(i),n=useInteractive;t&&(e=t),n&&(useInteractive=!1),k(r),hasProp(g.config,c)&&(g.config[i]=g.config[c]);try{req.exec(e)}catch(e){return O(makeError("fromtexteval","fromText eval for "+c+" failed: "+e,e,[c]))}n&&(useInteractive=!0),this.depMaps.push(r),l.completeLoad(i),s([i],o)}),void e.load(u.name,s,o,g))})),l.enable(e,this),this.pluginMaps[e.id]=e},enable:function(){(f[this.map.id]=this).enabled=!0,this.enabling=!0,each(this.depMaps,bind(this,function(e,t){var i,r,n;if("string"==typeof e){if(e=S(e,this.map.isDefine?this.map:this.map.parentMap,!1,!this.skipMap),this.depMaps[t]=e,n=getOwn(c,e.id))return 
void(this.depExports[t]=n(this));this.depCount+=1,M(e,"defined",bind(this,function(e){this.undefed||(this.defineDep(t,e),this.check())})),this.errback?M(e,"error",bind(this,this.errback)):this.events.error&&M(e,"error",bind(this,function(e){this.emit("error",e)}))}i=e.id,r=p[i],hasProp(c,i)||!r||r.enabled||l.enable(e,this)})),eachProp(this.pluginMaps,bind(this,function(e){var t=getOwn(p,e.id);t&&!t.enabled&&l.enable(e,this)})),this.enabling=!1,this.check()},on:function(e,t){var i=this.events[e];i||(i=this.events[e]=[]),i.push(t)},emit:function(e,t){each(this.events[e],function(e){e(t)}),"error"===e&&delete this.events[e]}},(l={config:g,contextName:u,registry:p,defined:m,urlFetched:n,defQueue:h,defQueueMap:{},Module:e,makeModuleMap:S,nextTick:req.nextTick,onError:O,configure:function(e){if(e.baseUrl&&"/"!==e.baseUrl.charAt(e.baseUrl.length-1)&&(e.baseUrl+="/"),"string"==typeof e.urlArgs){var i=e.urlArgs;e.urlArgs=function(e,t){return(-1===t.indexOf("?")?"?":"&")+i}}var r=g.shim,n={paths:!0,bundles:!0,config:!0,map:!0};eachProp(e,function(e,t){n[t]?(g[t]||(g[t]={}),mixin(g[t],e,!0,!0)):g[t]=e}),e.bundles&&eachProp(e.bundles,function(e,t){each(e,function(e){e!==t&&(v[e]=t)})}),e.shim&&(eachProp(e.shim,function(e,t){isArray(e)&&(e={deps:e}),!e.exports&&!e.init||e.exportsFn||(e.exportsFn=l.makeShimExports(e)),r[t]=e}),g.shim=r),e.packages&&each(e.packages,function(e){var t;t=(e="string"==typeof e?{name:e}:e).name,e.location&&(g.paths[t]=e.location),g.pkgs[t]=e.name+"/"+(e.main||"main").replace(currDirRegExp,"").replace(jsSuffixRegExp,"")}),eachProp(p,function(e,t){e.inited||e.map.unnormalized||(e.map=S(t,null,!0))}),(e.deps||e.callback)&&l.require(e.deps||[],e.callback)},makeShimExports:function(t){return function(){var e;return t.init&&(e=t.init.apply(global,arguments)),e||t.exports&&getGlobal(t.exports)}},makeRequire:function(o,a){function s(e,t,i){var r,n;return a.enableBuildCallback&&t&&isFunction(t)&&(t.__requireJsBuild=!0),"string"==typeof 
e?isFunction(t)?O(makeError("requireargs","Invalid require call"),i):o&&hasProp(c,e)?c[e](p[o.id]):req.get?req.get(l,e,o,s):(r=S(e,o,!1,!0).id,hasProp(m,r)?m[r]:O(makeError("notloaded",'Module name "'+r+'" has not been loaded yet for context: '+u+(o?"":". Use require([])")))):(T(),l.nextTick(function(){T(),(n=k(S(null,o))).skipMap=a.skipMap,n.init(e,t,i,{enabled:!0}),R()}),s)}return a=a||{},mixin(s,{isBrowser:isBrowser,toUrl:function(e){var t,i=e.lastIndexOf("."),r=e.split("/")[0];return-1!==i&&(!("."===r||".."===r)||1<i)&&(t=e.substring(i,e.length),e=e.substring(0,i)),l.nameToUrl(q(e,o&&o.id,!0),t,!0)},defined:function(e){return hasProp(m,S(e,o,!1,!0).id)},specified:function(e){return e=S(e,o,!1,!0).id,hasProp(m,e)||hasProp(p,e)}}),o||(s.undef=function(i){j();var e=S(i,o,!0),t=getOwn(p,i);t.undefed=!0,E(i),delete m[i],delete n[e.url],delete r[i],eachReverse(h,function(e,t){e[0]===i&&h.splice(t,1)}),delete l.defQueueMap[i],t&&(t.events.defined&&(r[i]=t.events),P(i))}),s},enable:function(e){getOwn(p,e.id)&&k(e).enable()},completeLoad:function(e){var t,i,r,n=getOwn(g.shim,e)||{},o=n.exports;for(j();h.length;){if(null===(i=h.shift())[0]){if(i[0]=e,t)break;t=!0}else i[0]===e&&(t=!0);a(i)}if(l.defQueueMap={},r=getOwn(p,e),!t&&!hasProp(m,e)&&r&&!r.inited){if(!(!g.enforceDefine||o&&getGlobal(o)))return w(e)?void 0:O(makeError("nodefine","No define call for "+e,null,[e]));a([e,n.deps||[],n.exportsFn])}R()},nameToUrl:function(e,t,i){var r,n,o,a,s,u,c=getOwn(g.pkgs,e);if(c&&(e=c),u=getOwn(v,e))return l.nameToUrl(u,t,i);if(req.jsExtRegExp.test(e))a=e+(t||"");else{for(r=g.paths,o=(n=e.split("/")).length;0<o;o-=1)if(s=getOwn(r,n.slice(0,o).join("/"))){isArray(s)&&(s=s[0]),n.splice(0,o,s);break}a=n.join("/"),a=("/"===(a+=t||(/^data\:|^blob\:|\?/.test(a)||i?"":".js")).charAt(0)||a.match(/^[\w\+\.\-]+:/)?"":g.baseUrl)+a}return g.urlArgs&&!/^blob\:/.test(a)?a+g.urlArgs(e,a):a},load:function(e,t){req.load(l,e,t)},execCb:function(e,t,i,r){return 
t.apply(r,i)},onScriptLoad:function(e){if("load"===e.type||readyRegExp.test((e.currentTarget||e.srcElement).readyState)){interactiveScript=null;var t=s(e);l.completeLoad(t.id)}},onScriptError:function(e){var i=s(e);if(!w(i.id)){var r=[];return eachProp(p,function(e,t){0!==t.indexOf("_@r")&&each(e.depMaps,function(e){if(e.id===i.id)return r.push(t),!0})}),O(makeError("scripterror",'Script error for "'+i.id+(r.length?'", needed by: '+r.join(", "):'"'),e,[i.id]))}}}).require=l.makeRequire(),l}function getInteractiveScript(){return interactiveScript&&"interactive"===interactiveScript.readyState||eachReverse(scripts(),function(e){if("interactive"===e.readyState)return interactiveScript=e}),interactiveScript}}(this,"undefined"==typeof setTimeout?void 0:setTimeout);
import pandas as pd
import numpy as np
import sys
# Make local helper modules importable.
sys.path.append("/home/me/mypy")
# Paths to the UCI wine-quality CSVs (semicolon-delimited files).
red = 'datasets/winequality-red.csv'
white = 'datasets/winequality-white.csv'
import plotly.express as px
def plot_occurrences(df, title="", group_keys=None, separate_color=False):
    """Plot the number of rows per quality label as a bar chart.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing at least the columns named in ``group_keys``.
    title : str
        Figure title.
    group_keys : list of str, optional
        Columns to group by; defaults to ``['quality']``.  (Default is None
        rather than a mutable list literal.)
    separate_color : bool
        If True, draw one bar group per value of ``group_keys[1]``.

    Raises
    ------
    ValueError
        If ``separate_color`` is True but fewer than two group keys are given.
    """
    if group_keys is None:
        group_keys = ['quality']
    if separate_color and len(group_keys) < 2:
        raise ValueError("separate_color=True requires a second group key")
    # size() counts rows directly, instead of the old count()/rename trick
    # that silently depended on a 'pH' column being present.
    df_count = df.groupby(by=group_keys).size().reset_index(name='count')
    if separate_color:
        fig = px.bar(df_count, x='quality', y='count', barmode='group',
                     color=group_keys[1], title=title)
    else:
        fig = px.bar(df_count, x='quality', y='count', title=title)
    fig.show()
# Load both datasets (semicolon-delimited) and tag each row with its wine type.
df_red = pd.read_csv(red,sep=";")
df_white = pd.read_csv(white,sep=";")
df_red['type'] = np.full(df_red.shape[0],"red")
df_white['type'] = np.full(df_white.shape[0],"white")
# Stack red and white into a single frame.  NOTE(review): the original row
# indexes are kept, so index labels are duplicated between the two subsets.
df_all = pd.concat([df_red,df_white])
# Label distributions: per wine type, then with both types merged.
plot_occurrences(df_all,group_keys=['quality','type'],
title="Count for Red and white",
separate_color=True)
plot_occurrences(df_all,title="Count Both merged")
The output labels range from 3 to 9. According to the description originally distributed with the dataset, the label should have contained values from 0 to 10.
9 has only 5 values
3 has 30 values
sorted(df_all['quality'].unique())
[3, 4, 5, 6, 7, 8, 9]
The number of occurrences of the label 9 is so small (only 5 samples) that it is impossible to use it accurately in a prediction; therefore I opt to merge the labels 8 and 9 into a single class meaning "8 or higher".
# Merge the rare label 9 into 8 ("8 or higher").
merge_dict = {9:8}
# Replace only in the 'quality' column.  The previous
# df_all.applymap(...) rewrote EVERY cell equal to 9, so a feature value of 9
# (e.g. a sulphur-dioxide or acidity reading of 9.0) was corrupted to 8 too.
df_all['quality'] = df_all['quality'].replace(merge_dict)
plot_occurrences(df_all,title="Merged 8 with 9")
# One-hot encode the wine type and splice the indicator columns ('red',
# 'white') in place of the original 'type' column.
dummies = pd.get_dummies(df_all.type)
df_all = pd.concat([df_all.drop(columns=['type']), dummies], axis=1)
t-SNE transforms a high-dimensional dataset into two dimensions while trying to preserve the notion of distance from the high-dimensional space: points that were far apart in the high-dimensional representation should also be far apart in the t-SNE projection.
In this case, the high-dimensional space does not appear to be particularly informative for the classification task.
from sklearn.manifold import TSNE
import plotly.express as px
# Project the feature space (everything except the label) to 2-D with t-SNE;
# random_state pins the embedding so the plot is reproducible.
features = df_all.drop(columns=['quality'])
tsne = TSNE(n_components=2, random_state=0)
projections = tsne.fit_transform(features)
# Colour each projected point by its quality label.
# NOTE(review): the legend label 'species' looks copied from an iris example;
# 'quality' seems to be what is meant — confirm.
fig = px.scatter(
projections, x=0, y=1,
color=df_all.quality, labels={'color': 'species'}
)
fig.show()
We stratify so that the model is trained on data whose label distribution matches that of the data it will have to predict on.
from sklearn.model_selection import train_test_split
X = features
y = df_all["quality"]
# Stratified split keeps the (imbalanced) label distribution identical in
# train and test; random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.33, random_state=42, stratify=y)
# Rebuild labelled frames tagged with their split so the split itself can be
# visualised with the same bar-chart helper.
df_tr = pd.concat([X_train,y_train],axis=1)
df_te = pd.concat([X_test,y_test],axis=1)
df_tr['split'] = "training"
df_te['split'] = "test"
df_splits = pd.concat([df_tr,df_te],axis=0)
plot_occurrences(df_splits,
group_keys=['quality','split'],
separate_color=True,
title="Training/Test distribution")
When we have a training set $S = (\textbf{x}_1,y_1),\dots,(\textbf{x}_m,y_m)$ where $\textbf{x}_i \in \mathbb{R}^d$ and $y_i \in \{\pm 1\}$, we say that this set is linearly separable if there is a halfspace (one of the two parts into which a hyperplane divides the affine space) $(\textbf{w},b)$ such that
$$ \forall i \in [m], \quad y_i(\langle \textbf{w},\textbf{x}_i \rangle + b) > 0 $$
Furthermore we have that the distance between a point $\textbf{x}$ and the hyperplane defined by $(\textbf{w},b)$ where $\|w\| = 1$ is $|\langle \textbf{w,x}\rangle + b|$
The purpose of the svm is to find the hyperplane such that the distance between the hyperplane and the closest points is maximum. Intuitively this is so that the resulting hyperplane does a better job at generalizing the classification results. This can be formalized in the following way:
$$ \underset{(\textbf{w},b):\|\textbf{w}\|=1}{\mathrm{argmax}} \min_{i \in [m]} y_i (\langle \textbf{w},\textbf{x}_i \rangle + b) $$Another equivalent formulation of the problem is
$$ (\textbf{w}_0,b_0) = \underset{(\textbf{w},b)}{\mathrm{argmin}} \ \| \textbf{w} \| ^ 2 \\ s.t. \\ \forall i, \ y_i (\langle \textbf{w},\textbf{x}_i \rangle + b) \geq 1 $$output: $$ \hat{\textbf{w}} = \frac{\textbf{w}_0}{\| \textbf{w}_0 \|} \quad \hat{b} = \frac{b_0}{\| \textbf{w}_0 \|} $$
We enforce the correct separation of the labeled points in the conditions under s.t. Furthermore we force the margin to be 1, but we measure the width of the margin with the norm of $\textbf{w}$. Therefore finding the largest margin for this problem boils down to finding the minimum for $\textbf{w}$.
The input is $(\textbf{x}_1,y_1) ... (\textbf{x}_m,y_m)$
$$ \min_{\textbf{w},b,\boldsymbol{\epsilon} } \left( \lambda \| \textbf{w} \| ^ 2 + \frac{1}{m}\sum_{i=1}^m \epsilon_i\right) \\ s.t. \\ \forall i, \ y_i (\langle \textbf{w},\textbf{x}_i \rangle + b) \geq 1 - \epsilon_i \ \text{ and } \ \epsilon_i \geq 0 $$Output: $\textbf{w}, b$
The hard-margin SVM assumes that all of the points in the dataset can be linearly separated. This is a very strong assumption that can be relaxed: we add slack variables $\epsilon_i$ that regulate how large the mistakes may be, while the parameter $\lambda$ controls the tradeoff between the width of the margin and the error made when classifying.
Sometimes our dataset might not be linearly separable, but it might be separable in a non-linear space. The solution could be to map our original points into a non-linear space and then perform an SVM classification in the new space. The problem here is the mapping: mapping points into a non-linear space and then mapping them back into the original space might be too computationally intensive, therefore we use kernels in order to obtain a way of performing classification in a non-linear space.
Given some embedding of the data $\psi(x)$ into some Hilbert space, we define the kernel as the inner product of the embedded points: $$ K(\textbf{x},\textbf{x'}) = \langle \psi(\textbf{x}), \psi(\textbf{x'}) \rangle $$
This function can be thought of as a function that measures the similarity between two points. It turns out that we can carry out the SVM classification using only the value of the kernel function on every pair of points in the dataset.
Our svm problem can be rewritten in the following equivalent form:
$$ \min_{w} (f(\langle \textbf{w} , \psi(\textbf{x}_1)\rangle , ... , \langle \textbf{w} , \psi(\textbf{x}_m)\rangle) + R (\| \textbf{w} \|)) $$where f is an arbitrary function and R is a monotonically nondecreasing function.
Thanks to the representer theorem we have that the solution to the previous equation given that $\psi$ maps into a hilbert space is:
$$ \textbf{w} = \sum_{i=1}^m \alpha_i \psi(\textbf{x}_i) + \textbf{u} $$Thanks to this theorem we can rewrite the minimization function in the following way
$$ \langle \textbf{w} , \psi(\textbf{x}_i) \rangle = \langle \sum_j \alpha_j \psi(\textbf{x}_j), \psi(\textbf{x}_i) \rangle= \sum_j \alpha_j \langle \psi(\textbf{x}_j), \psi(\textbf{x}_i) \rangle $$$$ \| \textbf{w} \|^2 = \langle \sum_j \alpha_j \psi(\textbf{x}_j), \sum_j \alpha_j \psi(\textbf{x}_j) \rangle= \sum_{i,j=1}^m \alpha_i \alpha_j \langle \psi(\textbf{x}_i) , \psi(\textbf{x}_j) \rangle $$And therefore the original problem can be rewritten in this way:
$$ \min_{\alpha \in R^m} f \left( \sum_{j=1}^m \alpha_j K(\textbf{x}_j,\textbf{x}_1) ,..., \alpha_j K(\textbf{x}_j,\textbf{x}_m) \right) + R \left( \sqrt{\sum_{i,j=1}^m \alpha_i \alpha_j K(\textbf{x}_j,\textbf{x}_i) } \right) $$from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
# Scale the features, then fit an SVM; scaling matters for RBF/linear SVMs.
pipe = Pipeline(steps=[('scaler', StandardScaler()),
                       ('svm', SVC())])
# A list of dicts instead of one flat dict: gamma is ignored by the linear
# kernel, so the flat grid fitted 8 redundant (identical) linear models —
# one per gamma value.  The candidate set of distinct models is unchanged.
param_grid = [
    {'svm__kernel': ['linear'], 'svm__C': [1, 10]},
    {'svm__kernel': ['rbf'], 'svm__C': [1, 10],
     'svm__gamma': [0.01, 0.1, 1, 10]},
]
# Note that by default the GridSearchCV function does a stratified K-fold when
# y is either binary or multiclass and the estimator is a classifier
clf = GridSearchCV(pipe, param_grid, n_jobs=-1)
clf.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
('svm', SVC())]),
n_jobs=-1,
param_grid={'svm__C': [1, 10], 'svm__gamma': [0.01, 0.1, 1, 10],
'svm__kernel': ['linear', 'rbf']})
clf.best_estimator_
Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC(C=1, gamma=1))])
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
def scores_and_confusion(clf, X_test, y_test, labels=(3, 4, 5, 6, 7, 8)):
    """Report a fitted search object's best estimator and test accuracy, and
    show its confusion matrix on the test set as a heatmap.

    Parameters
    ----------
    clf : fitted GridSearchCV
        Must expose ``best_estimator_``, ``predict`` and ``score``.
    X_test, y_test
        Held-out features and labels.
    labels : sequence of int, optional
        Class labels, in the order used for both axes of the matrix.
        Defaults to the quality labels present after merging 9 into 8
        (previously this list was hard-coded in four places).
    """
    labels = list(labels)
    cf = confusion_matrix(y_test, clf.predict(X_test), labels=labels)
    cf = pd.DataFrame(cf, columns=labels, index=labels)
    print(f'The best estimator is {clf.best_estimator_}')
    print(f'The accuracy is {clf.score(X_test,y_test)}')
    fig = px.imshow(cf,
                    labels=dict(x="Predicted", y="True", color="Count"),
                    x=labels,
                    y=labels,
                    color_continuous_scale=px.colors.sequential.Blues,
                    width=800, height=500)
    fig.update_xaxes(title_text='Predicted', side="top", tickmode='linear')
    fig.update_yaxes(title_text='True')
    fig.show()
#fig, ax = plt.subplots(figsize=(10,7))
#sns.heatmap(cf,linewidths=.5, cmap='Blues', annot=True, ax=ax)
scores_and_confusion(clf,X_test,y_test)
The best estimator is Pipeline(steps=[('scaler', StandardScaler()), ('svm', SVC(C=1, gamma=1))])
The accuracy is 0.6261072261072261
from sklearn.ensemble import RandomForestClassifier
# Scaling is not needed by tree ensembles, but it is harmless and keeps the
# pipeline shape identical to the SVM one above.
forest_pipe = Pipeline(steps=[('scaler',StandardScaler())
,('forest',RandomForestClassifier()) ])
forest_grid = {
'forest__n_estimators': [100,200],
'forest__min_samples_leaf':[2,3,4,5]
}
# Same stratified-CV grid search as for the SVM.
clf_forest = GridSearchCV(forest_pipe, forest_grid, n_jobs=-1)
clf_forest.fit(X_train, y_train)
GridSearchCV(estimator=Pipeline(steps=[('scaler', StandardScaler()),
('forest', RandomForestClassifier())]),
n_jobs=-1,
param_grid={'forest__min_samples_leaf': [2, 3, 4, 5],
'forest__n_estimators': [100, 200]})
scores_and_confusion(clf_forest,X_test,y_test)
The best estimator is Pipeline(steps=[('scaler', StandardScaler()),
('forest',
RandomForestClassifier(min_samples_leaf=2, n_estimators=200))])
The accuracy is 0.6680652680652681
Instead of using the cross-validation technique, here we use the out-of-bag error. In this case we can fit the model on the whole training set, since the "testing" is done on the out-of-bag samples of each individual tree and the scores are then averaged.
# Same grid as before, plus oob_score=True so that fit() also computes the
# accuracy on the out-of-bag samples of every tree.
forest_grid_oob = {
    'forest__n_estimators': [100, 200],
    'forest__min_samples_leaf': [2, 3, 4, 5],
    'forest__oob_score': [True],
}
clf_forest_oob = GridSearchCV(forest_pipe, forest_grid_oob, n_jobs=-1)
clf_forest_oob.fit(X_train, y_train)
# Fetch the fitted forest by step name instead of positional tuple indexing.
best_forest = clf_forest_oob.best_estimator_.named_steps['forest']
oob_score = best_forest.oob_score_
print(f'The oob score is {oob_score}')
The oob score is 0.6652113970588235
scores_and_confusion(clf_forest_oob,X_test,y_test)
The best estimator is Pipeline(steps=[('scaler', StandardScaler()),
('forest',
RandomForestClassifier(min_samples_leaf=2, n_estimators=200,
oob_score=True))])
The accuracy is 0.6717948717948717
from pytictoc import TicToc
# Compare wall-clock fit time of the plain CV search vs the OOB-scoring
# variant.  NOTE(review): both calls refit the full GridSearchCV from
# scratch, so this measures the whole search, not a single forest fit.
t = TicToc()
t.tic()
clf_forest.fit(X_train, y_train)
t.toc()
t.tic()
clf_forest_oob.fit(X_train, y_train)
t.toc()
Elapsed time is 4.357786 seconds. Elapsed time is 4.603377 seconds.
The times are almost the same, however the number of estimators in the oob version is 200 while the number of estimators in the crossvalidation version is 100.
There is a problem with this division: some classes have so few elements that it is impossible to perform any meaningful cross-validation, therefore we need to increase the number of samples of these underrepresented labels.
# Full feature matrix / label vector (before any resampling).
X,y = df_all.drop(['quality'],axis=1), df_all['quality']
# Display the label series.
y
0 5
1 5
2 5
3 6
4 5
..
4893 6
4894 5
4895 6
4896 7
4897 6
Name: quality, Length: 6497, dtype: int64
from imblearn.over_sampling import SMOTE
# k_neighbors is the number of neighbours used to build synthetic samples;
# it must be smaller than the size of the rarest class (quality 3, 30 rows),
# so 3 is safe.
# A float sampling_strategy is only valid for a BINARY target — for this
# multi-class problem it raises the ValueError shown in the traceback below.
# 'not majority' resamples every minority class up to the majority count.
oversample = SMOTE(k_neighbors=3, sampling_strategy='not majority')
X_smote,y_smote = oversample.fit_resample(X,y)
--------------------------------------------------------------------------- ValueError Traceback (most recent call last) <ipython-input-42-a9681f12e783> in <module> 3 #We need to set it to 3 since it is the size of the smallest class 3 4 oversample = SMOTE(k_neighbors=3,sampling_strategy=.5) ----> 5 X_smote,y_smote = oversample.fit_resample(X,y) ~/miniconda3/lib/python3.8/site-packages/imblearn/base.py in fit_resample(self, X, y) 77 X, y, binarize_y = self._check_X_y(X, y) 78 ---> 79 self.sampling_strategy_ = check_sampling_strategy( 80 self.sampling_strategy, y, self._sampling_type 81 ) ~/miniconda3/lib/python3.8/site-packages/imblearn/utils/_validation.py in check_sampling_strategy(sampling_strategy, y, sampling_type, **kwargs) 533 return OrderedDict( 534 sorted( --> 535 _sampling_strategy_float(sampling_strategy, y, sampling_type).items() 536 ) 537 ) ~/miniconda3/lib/python3.8/site-packages/imblearn/utils/_validation.py in _sampling_strategy_float(sampling_strategy, y, sampling_type) 357 type_y = type_of_target(y) 358 if type_y != "binary": --> 359 raise ValueError( 360 '"sampling_strategy" can be a float only when the type ' 361 "of target is binary. For multi-class, use a dict." ValueError: "sampling_strategy" can be a float only when the type of target is binary. For multi-class, use a dict.
# Copy the resampled features so adding the label column does not mutate
# X_smote in place (the original `df_smote = X_smote` was an alias).
df_smote = X_smote.copy()
df_smote['quality'] = y_smote
# `plot_dist` is not defined anywhere in this notebook — the occurrence
# plotter defined above is plot_occurrences, which takes the same arguments.
plot_occurrences(df_smote,title="Distribution for smote dataset")